Show the code
library(tidyverse)
library(knitr)
library(ggVennDiagram)library(tidyverse)
library(knitr)
library(ggVennDiagram)Our data that we will work with today can be found by searching metagenome bins associated with GOLD Study ID Gs0161344 IMG/MER
A description of the column headers for the file we will work with
Load the MAG table with modifications
NEON_MAGs <- read_tsv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2025_3_18.tsv") |>
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) |>
# create a new column with the Assembly Type
mutate("Assembly Type" = `Genome Name`) |>
mutate("Assembly Type" = case_when(`Assembly Type` == "NEON combined assembly" | `Assembly Type` == "WREF organic layer samples" | `Assembly Type` == "WREF mineral layer samples" | `Assembly Type` == "WREF site samples" | `Assembly Type` == "WREF plot 73 samples" | `Assembly Type` == "WREF plot 4 samples" | `Assembly Type` == "WREF plot 3 samples" ~ `Assembly Type`, TRUE ~ "Individual")) |>
mutate_at("Assembly Type", str_replace, "WREF organic layer samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "WREF mineral layer samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "WREF site samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "WREF plot 3 samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "WREF plot 4 samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "WREF plot 73 samples", "Coassembly") |>
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Coassembly") |>
mutate("Assembly Method" = case_when(`Genome Name` == "NEON combined assembly" | `Genome Name` == "WREF site samples" ~ `Genome Name`,
TRUE ~ "MetaSpades")) |>
mutate_at("Assembly Method", str_replace, "NEON combined assembly", "MetaHipMer") |>
mutate_at("Assembly Method", str_replace, "WREF site samples", "MetaHipMer") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "d__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "p__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "c__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "o__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "f__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "g__", "") |>
mutate_at("GTDB Taxonomy Lineage", str_replace, "s__", "") |>
separate(`GTDB Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) |>
mutate_at("Domain", na_if,"") |>
mutate_at("Phylum", na_if,"") |>
mutate_at("Class", na_if,"") |>
mutate_at("Order", na_if,"") |>
mutate_at("Family", na_if,"") |>
mutate_at("Genus", na_if,"") |>
mutate_at("Species", na_if,"") |>
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") |>
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") |>
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") |>
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) |>
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") |>
mutate(`Sample Name` = coalesce(`Sample Name`, Site)) |>
select(-bin_oid)From IMG create table from IMG with the metagenome information including Ecosystem subtype, latitude, longitude, Genome Size, Gene Count. Just display the table in your Lab 10 report. We will work more with this table next week.
To do this
Quick Genome Search with the Study ID Gs0161344 to find metagenomes associated with our project.176 Hits.RedisplaySelect All (There should be 176 metagenomes)Exportread_tsv because although it says it is a csv file it is actually a tsv (tab separated value) file.re-annotationNEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") |>
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) |>
rename(`Genome Name` = `Genome Name / Sample Name`) |>
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) |>
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") |>
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") |>
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") |>
mutate(`Sample Name` = coalesce(`Sample Name`, Site)) |>
select(-c(Site, `IMG Genome ID`)) NEON_MAGs_metagenomes <- NEON_MAGs |>
left_join(NEON_metagenomes, by = "Sample Name") NEON_MAGs_WREF <- NEON_MAGs_metagenomes |>
filter(str_detect(Site, 'WREF') | str_detect(Site, "Wind River"))NEON_MAGs_87 <- NEON_MAGs_metagenomes |>
filter(`Sample Name` == "NEON combined assembly" | `Assembly Type` == "Individual")NEON_MAGs_WREF |>
ggplot(aes(x = `Sample Name`, fill = Phylum)) +
geom_bar()+
coord_flip() NEON_MAGs_WREF |>
distinct(`GTDB Taxonomy Lineage`) |>
count()# A tibble: 1 × 1
n
<int>
1 82
NEON_MAGs_WREF |>
distinct(`GTDB Taxonomy Lineage`, `Sample Name`, Phylum) |>
ggplot(aes(x = `Sample Name`, fill = Phylum)) +
geom_bar()+
coord_flip()Set1 <- NEON_MAGs_WREF |>
filter(Subplot == "003")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF plot 3 samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA")) Set1 <- NEON_MAGs_WREF |>
filter(Subplot == "004")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF plot 4 samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA")) Set1 <- NEON_MAGs_WREF |>
filter(Subplot == "073")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF plot 73 samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA")) Set1 <- NEON_MAGs_WREF |>
filter(Layer == "O")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF organic layer samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA"))Set1 <- NEON_MAGs_WREF |>
filter(Layer == "M")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF mineral layer samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA"))Set1 <- NEON_MAGs_WREF |>
filter(`Assembly Type` == "Individual")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF site samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA"))Set1 <- NEON_MAGs_WREF |>
filter(Site == "WREF organic layer samples" | Site == "WREF mineral layer samples" )
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF site samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("CoA-Layer", "CoA-Site"))Set1 <- NEON_MAGs_WREF |>
filter(`Assembly Type` == "Individual")
Set2 <- NEON_MAGs_WREF |>
filter(Site == "WREF organic layer samples" | Site == "WREF mineral layer samples" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA-Layer"))Set1 <- NEON_MAGs_87 |>
filter(`Assembly Type` == "Individual")
Set2 <- NEON_MAGs_87 |>
filter(`Sample Name` == "NEON combined assembly" )
x <- list(Set1$`GTDB Taxonomy Lineage`, Set2$`GTDB Taxonomy Lineage` )
ggVennDiagram(x, category.names = c("Ind", "CoA"))level_order <- c("WREF site samples", "WREF organic layer samples", "WREF mineral layer samples", "WREF_001-O-20210621", "WREF plot 73 samples", "WREF_073-O-20210623", "WREF_073-M-20210623", "WREF plot 4 samples", "WREF_004-O-20210622", "WREF_004-M-20210622", "WREF plot 3 samples", "WREF_003-O-20210622", "WREF_003-M-20210622")
NEON_MAGs_WREF |>
group_by(`Sample Name`, `GTDB Taxonomy Lineage`, Phylum) |>
summarise(n = n()) |>
ggplot(aes(x = factor(`Sample Name`, level = level_order), y = `GTDB Taxonomy Lineage`, size = n, color = Phylum)) +
geom_point() +
theme(
axis.text.x = element_text(angle = 45, hjust=1))level_order <- c("WREF site samples", "WREF organic layer samples", "WREF_001-O-20210621", "WREF_003-O-20210622", "WREF_004-O-20210622", "WREF_073-O-20210623", "WREF mineral layer samples", "WREF_003-M-20210622", "WREF_004-M-20210622", "WREF_073-M-20210623")
NEON_MAGs_WREF |>
filter(`Assembly Type` == "Individual" | Site == "WREF organic layer samples" | Site == "WREF mineral layer samples" | Site == "WREF site samples") |>
group_by(`Sample Name`, `GTDB Taxonomy Lineage`, Phylum) |>
summarise(n = n()) |>
ggplot(aes(x = factor(`Sample Name`, level = level_order), y = `GTDB Taxonomy Lineage`, size = n, color = Phylum)) +
geom_point() +
theme(
axis.text.x = element_text(angle = 45, hjust=1))NEON_MAGs_WREF %>%
filter(Site == "WREF organic layer samples" | Site == "WREF mineral layer samples" | Site == "WREF site samples" )|>
ggplot(aes(x = `Bin Completeness`, y = `Scaffold Count`, shape = `Bin Quality`, color = `Assembly Method`)) +
geom_point() +
labs(title = "Relationship between MAGs completenes and the number of scaffolds", x = "MAG completeness", y = "Number of Scaffolds in the MAG") NEON_MAGs_WREF %>%
filter(Site == "WREF organic layer samples" | `Assembly Type` == "Individual" )|>
ggplot(aes(x = `Bin Completeness`, y = `Scaffold Count`, shape = `Bin Quality`, color = `Assembly Type`)) +
geom_point() +
labs(title = "Relationship between MAGs completenes and the number of scaffolds", x = "MAG completeness", y = "Number of Scaffolds in the MAG") NEON_MAGs_WREF %>%
filter(Site == "WREF organic layer samples" | Site == "WREF mineral layer samples"| `Assembly Type` == "Individual" )|>
ggplot(aes(x = `Bin Completeness`, y = `Scaffold Count`, shape = `Bin Quality`, color = `Assembly Type`)) +
geom_point() +
labs(title = "Relationship between MAGs completenes and the number of scaffolds", x = "MAG completeness", y = "Number of Scaffolds in the MAG") NEON_MAGs_87 %>%
ggplot(aes(x = `Bin Completeness`, y = `Scaffold Count`, shape = `Bin Quality`, color = `Assembly Type`)) +
geom_point() +
labs(title = "Relationship between MAGs completenes and the number of scaffolds", x = "MAG completeness", y = "Number of Scaffolds in the MAG")